import pandas as pd
import numpy as np
import math
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
Dataset source: http://archive.ics.uci.edu/ml/datasets/connectionist+bench+(sonar,+mines+vs.+rocks)
The file "sonar.mines" contains 111 patterns obtained by bouncing sonar signals off a metal cylinder at various angles and under various conditions. The file "sonar.rocks" contains 97 patterns obtained from rocks under similar conditions. The transmitted sonar signal is a frequency-modulated chirp, rising in frequency. The data set contains signals obtained from a variety of different aspect angles, spanning 90 degrees for the cylinder and 180 degrees for the rock.
Each pattern is a set of 60 numbers in the range 0.0 to 1.0. Each number represents the energy within a particular frequency band, integrated over a certain period of time. The integration apertures for higher frequencies occur later in time, since these frequencies are transmitted later during the chirp.
The label associated with each record contains the letter "R" if the object is a rock and "M" if it is a mine (metal cylinder). The numbers in the labels are in increasing order of aspect angle, but they do not encode the angle directly.
# Dataset identifier; also used as the filename prefix for every saved figure.
dataset_name = 'sonar'

# Location of the raw data file.
file_path = '~/data/sonar/'
file_name = 'sonar.all-data'
file = ''.join((file_path, file_name))

# header=None: the file is read without a header row, so pandas assigns
# integer column labels.
df = pd.read_csv(filepath_or_buffer=file, header=None)
df.shape
# Give the column with integer label 60 (the class column) a readable name.
df.rename({60: 'label'}, axis='columns', inplace=True)

# Quick inspection: dtypes / non-null counts, first rows, summary statistics.
df.info()
df.head()
df.describe()
# check dataframe for bad data
# DataFrame.isnull() is an alias of DataFrame.isna(), so a single call is
# enough to tell whether any cell is missing.
df.isna().any().any()
# Total number of missing cells. Adding the isnull() and isna() totals
# together would count every missing cell twice.
df.isna().sum().sum()
# convert label to numeric
# df2 = df.copy(deep=True)
# df2['label'] = np.where(df['label'] == 'M', 1, 0)
# df2.info()
# Collect every column name except the class column; the plotting cells
# iterate over this list.
feature_names = list(df.columns)
# Drop the first 'label' entry (raises ValueError if it is missing).
del feature_names[feature_names.index('label')]
# Count the samples in each class ('R' = rock, 'M' = mine).
num_rocks = len(df.loc[df['label'] == 'R'])
num_mines = len(df.loc[df['label'] == 'M'])
print('Samples with rocks: ', num_rocks)
print('Samples with mines: ', num_mines)

# Bar chart of the class balance, written to disk before being displayed.
sns.countplot(data=df, x='label')
plt.savefig(f'{dataset_name}_countplot.png')
plt.show()
# Box plot of every feature, split by class label, laid out on a grid.
num_plots = len(feature_names)
num_columns = 4
# True ceiling division. `int(n / c + n % c)` is only correct when n is a
# multiple of c; e.g. 7 plots in 4 columns would give 4 rows instead of 2.
num_rows = math.ceil(num_plots / num_columns)
x_value = 'label'
# squeeze=False always returns a 2-D array of Axes, so a single-row grid
# needs no special-casing.
fig, axs = plt.subplots(nrows=num_rows, ncols=num_columns, figsize=(16, 62), squeeze=False)
for index, name in enumerate(feature_names):
    row, col = divmod(index, num_columns)
    sns.boxplot(x=x_value, y=name, data=df, ax=axs[row][col], boxprops=dict(alpha=.9))
# Hide any grid cells left over when num_plots is not a multiple of num_columns.
for unused_ax in axs.ravel()[num_plots:]:
    unused_ax.set_visible(False)
plt.tight_layout()
plt.savefig(dataset_name + '_boxplots.png')
plt.show()
# Violin plot of every feature by class label, with the individual samples
# overlaid as a swarm plot, laid out on a grid.
num_plots = len(feature_names)
num_columns = 4
# True ceiling division. `int(n / c + n % c)` is only correct when n is a
# multiple of c; e.g. 7 plots in 4 columns would give 4 rows instead of 2.
num_rows = math.ceil(num_plots / num_columns)
x_value = 'label'
# squeeze=False always returns a 2-D array of Axes, so a single-row grid
# needs no special-casing.
fig, axs = plt.subplots(nrows=num_rows, ncols=num_columns, figsize=(16, 62), squeeze=False)
for index, name in enumerate(feature_names):
    row, col = divmod(index, num_columns)
    sns.violinplot(x=x_value, y=name, data=df, ax=axs[row][col])
    # Small black markers so the raw points stay visible on top of the violin.
    sns.swarmplot(x=x_value, y=name, data=df, ax=axs[row][col], color='k', size=3)
# Hide any grid cells left over when num_plots is not a multiple of num_columns.
for unused_ax in axs.ravel()[num_plots:]:
    unused_ax.set_visible(False)
plt.tight_layout()
plt.savefig(dataset_name + '_violinplots.png')
plt.show()
# Per-class distribution of every feature: the rock and mine samples are
# drawn on the same Axes so the two classes can be compared directly.
# NOTE(review): sns.distplot is deprecated in recent seaborn releases;
# consider migrating to histplot/kdeplot once the seaborn version is pinned.
num_plots = len(feature_names)
num_columns = 4
# True ceiling division. `int(n / c + n % c)` is only correct when n is a
# multiple of c; e.g. 7 plots in 4 columns would give 4 rows instead of 2.
num_rows = math.ceil(num_plots / num_columns)
label = 'label'
# squeeze=False always returns a 2-D array of Axes, so a single-row grid
# needs no special-casing.
fig, axs = plt.subplots(nrows=num_rows, ncols=num_columns, figsize=(16, 52), squeeze=False)
# Split the frame by class once instead of re-filtering for every feature.
rocks = df[df[label] == 'R']
mines = df[df[label] == 'M']
for index, name in enumerate(feature_names):
    row, col = divmod(index, num_columns)
    sns.distplot(rocks[name], ax=axs[row][col])
    sns.distplot(mines[name], ax=axs[row][col])
# Hide any grid cells left over when num_plots is not a multiple of num_columns.
for unused_ax in axs.ravel()[num_plots:]:
    unused_ax.set_visible(False)
# Put the legend on the last populated Axes explicitly; relying on the
# "current" Axes could target a hidden, empty grid cell.
axs.ravel()[num_plots - 1].legend(['rock', 'mine'])
plt.savefig(dataset_name + '_distplots_by_label.png')
plt.show()
# distribution plot
# One distribution plot per feature over all samples, laid out on a grid.
# NOTE(review): sns.distplot is deprecated in recent seaborn releases;
# consider migrating to histplot/kdeplot once the seaborn version is pinned.
num_plots = len(feature_names)
num_columns = 4
# True ceiling division. `int(n / c + n % c)` is only correct when n is a
# multiple of c; e.g. 7 plots in 4 columns would give 4 rows instead of 2.
num_rows = math.ceil(num_plots / num_columns)
# squeeze=False always returns a 2-D array of Axes, so a single-row grid
# needs no special-casing.
fig, axs = plt.subplots(nrows=num_rows, ncols=num_columns, figsize=(16, 52), squeeze=False)
for index, name in enumerate(feature_names):
    row, col = divmod(index, num_columns)
    sns.distplot(df[name], ax=axs[row][col])
# Hide any grid cells left over when num_plots is not a multiple of num_columns.
for unused_ax in axs.ravel()[num_plots:]:
    unused_ax.set_visible(False)
plt.savefig(dataset_name + '_distplots.png')
plt.show()
# compute pairwise correlation of the attributes
# The string-valued 'label' column is dropped explicitly: older pandas
# silently skipped non-numeric columns in corr(), but pandas >= 2.0 raises
# on them instead. The resulting matrix covers the numeric features only.
corr = df.drop(columns='label').corr()
corr
# Annotated heatmap of the pairwise correlation matrix on a single large Axes.
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(20, 20))
hm = sns.heatmap(
    corr,
    cmap="coolwarm",   # colour map
    annot=True,        # write the value inside each cell...
    fmt='.2f',         # ...formatted with two decimals
    linewidths=.05,    # thin separator lines between cells
    ax=ax,             # draw on the Axes created above rather than the current one
    # Optional tweaks left disabled:
    # square=True,             # force square cells (equal aspect)
    # annot_kws={"size": 14},  # annotation font size
)
# Title the figure and leave head-room for it above the plot area.
fig.suptitle(f'{dataset_name} attributes correlation heatmap', fontsize=14, fontweight='bold')
fig.subplots_adjust(top=0.93)
plt.savefig(f'{dataset_name}_heatmap.png')
plt.show()